Round 1: Top-level cell type annotation

Can we distinguish cell types by surface proteins?

.hdr <- "result/step1/matrix_final"
.data <- fileset.list(.hdr)
.file <- "result/step2/prot_bbknn.rds"
if.needed(.file, {
    
    .batches <- .fread(.prot.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]

    .bbknn <- rcpp_mmutil_bbknn_pca(.prot.data$mtx,
                                    .batches$batch,
                                    knn = 50, RANK = 10,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
  • nTconv : CD3+, CD4+, CD8-, CD25-/CD127+, CD45RA+/CD45RO-
  • mTconv : CD3+, CD4+, CD8-, CD25-/CD127+, CD45RA-/CD45RO+
  • nTreg : CD3+, CD4+, CD8-, CD25+/CD127-, CD45RA+/CD45RO-
  • mTreg : CD3+, CD4+, CD8-, CD25+/CD127-, CD45RA-/CD45RO+

Run annotation purely based on marker genes:

.file <- "result/step2/prot_annot.txt.gz"
if.needed(.file, {
    
    .pos.markers <- .read.marker.file("marker/surface_round1_positive.txt")
    .neg.markers <- .read.marker.file("marker/surface_round1_negative.txt")

    .annot.out <-
        rcpp_mmutil_annotate_columns(
            pos_labels = list(p1=.pos.markers),
            r_neg_labels = list(n1=.neg.markers),
            row_file = .prot.data$row,
            col_file = .prot.data$col,
            r_U = .bbknn$U,
            r_D = .bbknn$D,
            r_V = .bbknn$factors.adjusted,
            EM_TOL = 1e-6,
            EM_ITER = 500)
    annot.dt <- setDT(.annot.out$annotation)
    .col <- c("tag", "celltype", "prob", "ln.prob")
    names(annot.dt) <- .col
    annot.dt[, c("barcode","batch") := tstrsplit(tag, split="_")]
    fwrite(annot.dt, .file)
})
annot.dt <- fread(.file)

Two-dimensional density plot on the normalized CD marker expressions.

U <- .bbknn$U
D <- .bbknn$D
V <- .bbknn$factors.adjusted

prot.mtx <- pmax(exp(sweep(U, 2, D, `*`) %*% t(V)) - 1, 0)
prot.mtx <- apply(prot.mtx, 2, function(x) x/pmax(sum(x),1) * 1e4)

rownames(prot.mtx) <- readLines(.prot.data$row)
colnames(prot.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker_bbknn.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

Two-dimensional density plot on the raw CD marker concentrations.

prot.raw.mtx <- read.dense(.prot.data$mtx)
rownames(prot.raw.mtx) <- readLines(.prot.data$row)
colnames(prot.raw.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.raw.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

Will the same classification results hold in transcriptomic data?

.file <- "result/step2/gene_bbknn.rds"

if.needed(.file, {

    .batches <- .fread(.gene.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]

    .bbknn <- rcpp_mmutil_bbknn_pca(.gene.data$mtx,
                                    .batches$batch,
                                    knn = 50, RANK = 50,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE,
                                    NUM_THREADS = 8)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
degree.cutoff <- function(G, .cutoff = 3) {
    G.sub <- G
    n.remove <- sum(igraph::degree(G.sub) < .cutoff)
    while(n.remove > 0){
        vv <- igraph::V(G.sub)
        .retain <- vv[igraph::degree(G.sub) >= .cutoff]
        G.sub <- igraph::induced_subgraph(G.sub, .retain)
        n.remove <- sum(igraph::degree(G.sub) < .cutoff)
    }
    return(G.sub)
}

comp.cutoff <- function(G, .cutoff = 10){
    .comp <- igraph::components(G)
    .kk <- which(.comp$csize >= .cutoff) # valid component(s)
    .valid <- which(.comp$membership %in% .kk) # valid nodes
    G <- igraph::induced_subgraph(G, igraph::V(G)[.valid])
    return(G)
}

run.bbknn.umap <- function(.bbknn, .cells, res = .3, ...){

    .df <- data.frame(from = .bbknn$knn$src.index,
                      to = .bbknn$knn$tgt.index,
                      weight = .bbknn$knn$weight)

    G <- igraph::graph_from_data_frame(.df, directed=FALSE)
    G <- comp.cutoff(G, 10)
    G <- degree.cutoff(G, 3)

    nrepeat <- 20
    C <- list(modularity = 0)
    for(r in 1:nrepeat){
        c.r <- igraph::cluster_louvain(G, resolution = res)
        message(paste("modularity: ", c.r$modularity, "\n"))
        if(r == 1 || max(c.r$modularity) > max(C$modularity)){
            C <- c.r
        }
    }

    .clust <- data.table(tag = .cells[as.integer(C$names)],
                         membership = C$membership)

    A <- igraph::as_adj(G, attr="weight")

    umap.A <- uwot::optimize_graph_layout(A, verbose = TRUE,
                                          n_sgd_threads = "auto",
                                          ...)

    .dt <- data.table(tag = .cells[as.integer(rownames(A))],
                      umap1 = umap.A[,1],
                      umap2 = umap.A[,2]) %>% 
        left_join(.clust, by = "tag")
}
  1. Build a batch-balancing kNN graph

  2. Louvain graph-based clustering

  3. Construct UMAP the kNN backbone graph

.file <- "result/step2/gene_bbknn_louvain.txt.gz"
.cells <- readLines(.gene.data$col)
if.needed(.file, {
    .bbknn.umap <- run.bbknn.umap(.bbknn, .cells)
    fwrite(.bbknn.umap, .file)
})

.bbknn.umap <- fread(.file) %>%
    left_join(annot.dt, by = "tag") %>% 
    mutate(membership = as.factor(membership))

## remove small clusters for visualization
.size <- .bbknn.umap[, .(.N), by = .(membership)]
.bbknn.umap <-
    .size[N > 100, .(membership)] %>%
    left_join(.bbknn.umap, by = "membership")

Clustering patterns in scRNA-seq agrees with the classification results based on surface proteins

p1 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=membership)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300)

p2 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300) +
    scale_color_brewer(palette = "Paired")

plt <- p1 | p2
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_gene_only_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)

PDF

.ct.mem <- .bbknn.umap[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]

.df <- 
    .ct.mem %>%
    mutate(row = celltype, col = membership,
           weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>% 
    order.pair(ret.tab = TRUE)
plt <- 
    .gg.plot(.df, aes(col, row, fill=weight, size = `nc`)) +
    xlab("Louvain clustering") +
    ylab("cell type") +
    theme(legend.position = "bottom") +
    geom_point(pch=22) +
    scale_size_continuous("#cells", range=c(0,5)) +
    scale_fill_distiller("proportion",
                         palette="PuRd", direction = 1,
                         labels=function(x) round(sigmoid(x), 2))
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_gene_only_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)

PDF

Joint clustering analysis

.file <- "result/step2/full_bbknn.rds"

if.needed(.file, {

    .batches <- .fread(.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]

    .bbknn <- rcpp_mmutil_bbknn_pca(.data$mtx,
                                    .batches$batch,
                                    knn = 50, RANK = 50,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE,
                                    NUM_THREADS = 8)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)

1. Run annotation on the joint BBKNN data

.file <- "result/step2/prot_annot_full.txt.gz"
if.needed(.file, {
    
    .pos.markers <- .read.marker.file("marker/surface_round1_positive.txt")
    .neg.markers <- .read.marker.file("marker/surface_round1_negative.txt")

    .annot.out <-
        rcpp_mmutil_annotate_columns(
            pos_labels = list(p1=.pos.markers),
            r_neg_labels = list(n1=.neg.markers),
            row_file = .data$row,
            col_file = .data$col,
            r_U = .bbknn$U,
            r_D = .bbknn$D,
            r_V = .bbknn$factors.adjusted,
            EM_TOL = 1e-6,
            EM_ITER = 500)
    annot.dt <- setDT(.annot.out$annotation)
    .col <- c("tag", "celltype", "prob", "ln.prob")
    names(annot.dt) <- .col
    annot.dt[, c("barcode","batch") := tstrsplit(tag, split="_")]
    fwrite(annot.dt, .file)
})
annot.dt <- fread(.file)

Two-dimensional density plot on the raw CD marker concentrations.

prot.raw.mtx <- read.dense(.prot.data$mtx)
rownames(prot.raw.mtx) <- readLines(.prot.data$row)
colnames(prot.raw.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.raw.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker_full.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

2. Graph-based clustering

.file <- "result/step2/full_bbknn_louvain.txt.gz"
.cells <- readLines(.data$col)
if.needed(.file, {
    .bbknn.umap <- run.bbknn.umap(.bbknn, .cells)
    fwrite(.bbknn.umap, .file)
})

.bbknn.umap <- fread(.file) %>%
    left_join(annot.dt, by = "tag") %>% 
    mutate(membership = as.factor(membership))

## remove small clusters for visualization
.size <- .bbknn.umap[, .(.N), by = .(membership)]
.bbknn.umap <-
    .size[N > 100, .(membership)] %>%
    left_join(.bbknn.umap, by = "membership")
p1 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=membership)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300)

p2 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300) +
    scale_color_brewer(palette = "Paired")

plt <- p1 | p2
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_full_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)

PDF

.ct.mem <- .bbknn.umap[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]

.df <- 
    .ct.mem %>%
    mutate(row = celltype, col = membership,
           weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>% 
    order.pair(ret.tab = TRUE)

plt <-
    .gg.plot(.df, aes(col, row, fill=weight, size = `nc`)) +
    xlab("Louvain clustering") +
    ylab("cell type") +
    theme(legend.position = "bottom") +
    geom_point(pch=22) +
    scale_size_continuous("#cells", range=c(0,5)) +
    scale_fill_distiller("proportion",
                         palette="PuRd", direction = 1,
                         labels=function(x) round(sigmoid(x), 2))
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_full_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)

PDF

3. Refine cell type assignments with the clustering results

.valid.clust <- .ct.mem[order(pr, decreasing = T),
                        head(.SD, 1),
                        by = .(membership)]

.valid.celltype.cluster <-
    .valid.clust[pr > .5, .(membership, celltype)] %>%
    unique()

celltype.final <- 
    .bbknn.umap %>%
    rename(celltype.protein = celltype) %>%
    (function(x) left_join(.valid.celltype.cluster, x)) %>%
    filter(celltype.protein == celltype) %>%
    na.omit() %>%
    as.data.table

DOWNLOAD: Cell type estimation

p1 <-
    .gg.plot(celltype.final, aes(umap1, umap2, color=membership)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300)

p2 <-
    .gg.plot(celltype.final, aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300) +
    scale_color_brewer(palette = "Paired")

plt <- p1 | p2
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_final_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)

PDF

4. Confirm by other marker genes/proteins

.markers <-
    c("FOXP3", "ID3", "BACH2", "CXCR3", "PRDM1", "SGK1", "TCF7", "LEF1",
      "SELL", "IL2RA", "IL7R", "IKZF2", "CCR6", "CCR4", "CCR7", "CTLA4",
      "HLA-DRA", "anti_CD25", "anti_CD127", "anti_CD183", "anti_CD196",
      "anti_CD197", "anti_CD194", "anti_CD45RA", "anti_CD45RO",
      "anti_HLA") %>%
    unique
.marker.hdr <- "result/step2/marker/marker"
.mkdir(dirname(.marker.hdr))
.marker.data <- fileset.list(.marker.hdr)
if.needed(.marker.data, {
    .marker.data <-
        rcpp_mmutil_copy_selected_rows(.data$mtx,
                                       .data$row,
                                       .data$col,
                                       .markers,
                                       .marker.hdr)
})

Note: these plots show expression values without batch adjustment

.markers <- sort(as.character(unique(marker.dt$marker)))
for(g in .markers){
    plt <- plot.marker.umap(g)
    print(plt)
    .file <- fig.dir %&% "/Fig_marker_" %&% g %&% "_umap.pdf"
    .gg.save(filename = .file, plot = plt, width=3.2, height=2.8)
}

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

Basic statistics for the first round annotation (16,053 cells)

.hash.info <- fread("data/cell_info_batch3-7.csv.gz") %>%
    rename(tag = cbid, subject = id) %>%
    select(-`V1`) %>%
    mutate(disease = substr(`subject`, 1, 2)) %>% 
    mutate(hash = gsub(pattern="[a-zA-Z\\-]+", replacement="", `hash`))

.sample.info <-
    readxl::read_xlsx("data/Hashing list MS Treg project.xlsx", 1) %>% 
    rename(Sample = `Cell type`) %>%
    mutate(hash = gsub("#","",`hash`))

.dt <-
    celltype.final %>%
    mutate(batch=as.integer(batch)) %>%
    left_join(.hash.info) %>%
    mutate(ct=factor(celltype, c("nTconv","mTconv","nTreg","mTreg"))) %>% 
    left_join(.sample.info)

Combining all the experiments

PDF

PDF

Total CD4 samples

PDF

PDF

CD25 enriched samples

PDF

PDF